Dependencies

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Data

25x

load("../data/25x/Coleps_viridis/Morph_mvt.RData")
dd25 <- morph_mvt

load("../data/25x/Coleps_irchel/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Didinium/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Paramecium_bursaria/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

# load("../data/25x/Rotifer/Morph_mvt.RData")
# dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Dexiostoma/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Loxocephallus/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd25 <- rbind(dd25, morph_mvt)

load("../data/25x/Cryptomonas/Morph_mvt.RData")
morph_mvt$species <- "Cryptomonas"
dd25 <- rbind(dd25, morph_mvt)

# load("../data/25x/Tetrahymena_mixed//Morph_mvt.RData")
# morph_mvt$species <- "Tetrahymena"
# morph_mvt <- morph_mvt %>% filter(mean_area<1000)
# dd25 <- full_join(dd25, morph_mvt)

load("../data/25x/Debris/Morph_mvt.RData")
morph_mvt$species <- "Debris_and_other"
morph_mvt <- morph_mvt %>% filter(mean_area<700)
dd25 <- full_join(dd25, morph_mvt)
## Joining, by = c("file", "mean_grey", "sd_grey", "mean_area", "sd_area", "mean_perimeter", "sd_perimeter", "mean_major", "sd_major", "mean_minor", "sd_minor", "mean_ar", "sd_ar", "mean_turning", "sd_turning", "duration", "N_frames", "max_net", "net_disp", "net_speed", "gross_disp", "gross_speed", "max_step", "min_step", "sd_step", "sd_gross_speed", "max_gross_speed", "min_gross_speed", "id", "date", "species", "sample", "video")
## filtering

dd25 <- dd25 %>%
  filter(net_disp>50, net_speed>5)%>%
  select(-edible_algae,-microcosm.nr)

dd25$magnification <- 2.5

Species compositions

species <- unique(dd25$species)
species <- species[!is.element(species,c("Debris_and_other","Cryptomonas"))]
compositions <- read_csv("../../compositions.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   composition = col_character()
## )
## See spec(...) for full column specifications.
compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- paste0("c_",c(paste0(0,1:9),10:16))

Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

trainingdata <- dd25[complete.cases(dd25), ]

set.seed(624)

trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
  x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.7, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
  
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))


cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
                             testdata$species)

classifier_plot_svm(table = cf$table,
                    combination.nr = "all")

classifier_assessment_plot(cf = cf,
                           combination.nr = "all")

There are mainly 4 measures:

PPV is the most important for us!

formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

set.seed(62)
classifiers_18c_25x <- list()
classifiers_18c_25x_plots <- list()
classifiers_18c_25x_assess_plots <- list()

trainingdata <- dd25[complete.cases(dd25), ]

for(i in 1:length(compositions_list)){
  sub_trainingdata <- trainingdata %>%
    filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)

  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c_25x[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_25x_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_25x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_25x_plots) <- names(classifiers_18c_25x) <-
  names(classifiers_18c_25x_assess_plots) <- paste0("c_",c(paste0(0,1:9),10:16))
library(patchwork)
for(i in 1:16){
  print(classifiers_18c_25x_plots[[i]] + classifiers_18c_25x_assess_plots[[i]] + 
    plot_layout(widths = c(4,2)))
}

Save classifiers

saveRDS(classifiers_18c_25x, "svm_video_classifiers_18c_25x.rds")
# cl <- readRDS("classifiers_18c_25x.rds")